{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Full Mode: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学\n",
      "Default Mode: 我/ 来到/ 北京/ 清华大学\n",
      "他, 来到, 了, 网易, 杭研, 大厦\n",
      "小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, ，, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造\n",
      "李小福/是/创新办/主任/也/是/云计算/方面/的/专家/;/ /什么/是/八一双鹿/\n",
      "/例如/我/输入/一个/带/“/韩玉赏鉴/”/的/标题/，/在/自定义/词库/中/也/增加/了/此/词为/N/类/\n",
      "/「/台中/」/正確/應該/不會/被/切開/。/mac/上/可/分出/「/石墨烯/」/；/此時/又/可以/分出/來/凱特琳/了/。\n"
     ]
    }
   ],
   "source": [
    "seg_list = jieba.cut(\"我来到北京清华大学\", cut_all=True)\n",
    "print(\"Full Mode: \" + \"/ \".join(seg_list))  # 全模式\n",
    "\n",
    "seg_list = jieba.cut(\"我来到北京清华大学\", cut_all=False)\n",
    "print(\"Default Mode: \" + \"/ \".join(seg_list))  # 精确模式\n",
    "\n",
    "seg_list = jieba.cut(\"他来到了网易杭研大厦\")  # 默认是精确模式\n",
    "print(\", \".join(seg_list))\n",
    "\n",
    "seg_list = jieba.cut_for_search(\"小明硕士毕业于中国科学院计算所，后在日本京都大学深造\")  # 搜索引擎模式\n",
    "print(\", \".join(seg_list))\n",
    "'''\n",
    "jieba.cut 方法接受三个输入参数: 需要分词的字符串；cut_all 参数用来控制是否采用全模式；HMM 参数用来控制是否使用 HMM 模型\n",
    "jieba.cut_for_search 方法接受两个参数：需要分词的字符串；是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词，粒度比较细,待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。注意：不建议直接输入 GBK 字符串，可能无法预料地错误解码成 UTF-8\n",
    "jieba.cut 以及 jieba.cut_for_search 返回的结构都是一个可迭代的 generator，可以使用 for 循环来获得分词后得到的每一个词语(unicode)，或者用\n",
    "jieba.lcut 以及 jieba.lcut_for_search 直接返回 list\n",
    "jieba.Tokenizer(dictionary=DEFAULT_DICT) 新建自定义分词器，可用于同时使用不同词典。jieba.dt 为默认分词器，所有全局分词相关函数都是该分词器的映射。\n",
    "'''\n",
    "\n",
    "# 使用自定义词典\n",
    "user_dict = \"\"\"云计算 5\n",
    "李小福 2 nr\n",
    "创新办 3 i\n",
    "easy_install 3 eng\n",
    "好用 300\n",
    "韩玉赏鉴 3 nz\n",
    "八一双鹿 3 nz\n",
    "台中\n",
    "凱特琳 nz\n",
    "Edu Trust认证 2000\n",
    "\"\"\"\n",
    "with open('userdict.txt','w',encoding='utf-8') as f:\n",
    "    f.write(user_dict)\n",
    "# 加载自定义词典\n",
    "jieba.load_userdict(\"userdict.txt\")\n",
    "# 调整词典\n",
    "jieba.add_word('石墨烯')\n",
    "jieba.add_word('凱特琳')\n",
    "jieba.del_word('自定义词')\n",
    "test_sent = (\n",
    "\"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\\n\"\n",
    "\"例如我输入一个带“韩玉赏鉴”的标题，在自定义词库中也增加了此词为N类\\n\"\n",
    "\"「台中」正確應該不會被切開。mac上可分出「石墨烯」；此時又可以分出來凱特琳了。\"\n",
    ")\n",
    "words = jieba.cut(test_sent)\n",
    "words = jieba.cut(test_sent)\n",
    "print('/'.join(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 关键词提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "采购,应答,文件,电子,系统,中国移动,供应商,公告,四川,分公司\n",
      "采购,应答,文件,电子,系统,中国移动,供应商,公告,四川,分公司,代理,招标网,项目,天府,比选,递交,新区,2019,机构,投标\n",
      " TextRank\n",
      "吉林 1.0\n",
      "欧亚 0.9966893354178172\n",
      "置业 0.6434360313092776\n",
      "实现 0.5898606692859626\n",
      "收入 0.43677859947991454\n",
      "增资 0.4099900531283276\n",
      "子公司 0.35678295947672795\n",
      "城市 0.34971383667403655\n",
      "商业 0.34817220716026936\n",
      "业务 0.3092230992619838\n",
      "在建 0.3077929164033088\n",
      "营业 0.3035777049319588\n",
      "全资 0.303540981053475\n",
      "综合体 0.29580869172394825\n",
      "注册资本 0.29000519464085045\n",
      "有限公司 0.2807830798576574\n",
      "零售 0.27883620861218145\n",
      "百货 0.2781657628445476\n",
      "开发 0.2693488779295851\n",
      "经营范围 0.2642762173558316\n"
     ]
    }
   ],
   "source": [
    "import jieba.analyse\n",
    "topK = 10\n",
    "\n",
    "# 基于 TF-IDF 算法的关键词抽取\n",
    "\"\"\"\n",
    "jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())\n",
    "    sentence 为待提取的文本\n",
    "    topK 为返回几个 TF/IDF 权重最大的关键词，默认值为 20\n",
    "    withWeight 为是否一并返回关键词权重值，默认值为 False\n",
    "    allowPOS 仅包括指定词性的词，默认值为空，即不筛选\n",
    "\n",
    "jieba.analyse.TFIDF(idf_path=None) 新建 TFIDF 实例，idf_path 为 IDF 频率文件\n",
    "\"\"\"\n",
    "content = open('raw.txt', 'rb').read()\n",
    "tags = jieba.analyse.extract_tags(content, topK=topK)\n",
    "print(\",\".join(tags))\n",
    "\n",
    "# 可以先设置tfidf文件，导入之后再提取\n",
    "jieba.analyse.set_idf_path(\"./extra_dict/idf.txt.big\");\n",
    "tags = jieba.analyse.extract_tags(content, topK=20)\n",
    "print(\",\".join(tags))\n",
    "\n",
    "# 设置idf文件和stopword文件\n",
    "jieba.analyse.set_stop_words(\"./extra_dict/stop_words.txt\")\n",
    "jieba.analyse.set_idf_path(\"./extra_dict/idf.txt.big\");\n",
    "tags = jieba.analyse.extract_tags(content, topK=topK)\n",
    "\n",
    "# 关键词带weight\n",
    "tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=True)\n",
    "\n",
    "\n",
    "# 基于 TextRank 算法的关键词抽取\n",
    "'''\n",
    "jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 直接使用，接口相同，注意默认过滤词性。\n",
    "jieba.analyse.TextRank() 新建自定义 TextRank 实例\n",
    "'''\n",
    "s = \"此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。\"\n",
    "print(' TextRank')\n",
    "for x, w in jieba.analyse.textrank(s, withWeight=True):\n",
    "    print('%s %s' % (x, w))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "我 r\n",
      "爱 v\n",
      "北京 ns\n",
      "天安门 ns\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "jieba.posseg.POSTokenizer(tokenizer=None) 新建自定义分词器，tokenizer 参数可指定内部使用的 \n",
    "jieba.Tokenizer 分词器。\n",
    "jieba.posseg.dt 为默认词性标注分词器。\n",
    "'''\n",
    "import jieba.posseg as pseg\n",
    "words = pseg.cut(\"我爱北京天安门\")\n",
    "for word, flag in words:\n",
    "    print('%s %s' % (word, flag))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "word 永和\t\t start: 0 \t\t end:2\n",
      "word 服装\t\t start: 2 \t\t end:4\n",
      "word 饰品\t\t start: 4 \t\t end:6\n",
      "word 有限公司\t\t start: 6 \t\t end:10\n",
      "****************************************\n",
      "word 永和\t\t start: 0 \t\t end:2\n",
      "word 服装\t\t start: 2 \t\t end:4\n",
      "word 饰品\t\t start: 4 \t\t end:6\n",
      "word 有限\t\t start: 6 \t\t end:8\n",
      "word 公司\t\t start: 8 \t\t end:10\n",
      "word 有限公司\t\t start: 6 \t\t end:10\n"
     ]
    }
   ],
   "source": [
    "result = jieba.tokenize(u'永和服装饰品有限公司')\n",
    "for tk in result:\n",
    "    print(\"word %s\\t\\t start: %d \\t\\t end:%d\" % (tk[0],tk[1],tk[2]))\n",
    "print('*'*40)\n",
    "result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')\n",
    "for tk in result:\n",
    "    print(\"word %s\\t\\t start: %d \\t\\t end:%d\" % (tk[0],tk[1],tk[2]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting whoosh\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ba/19/24d0f1f454a2c1eb689ca28d2f178db81e5024f42d82729a4ff6771155cf/Whoosh-2.7.4-py2.py3-none-any.whl (468kB)\n",
      "\u001b[K     |████████████████████████████████| 471kB 196kB/s eta 0:00:01\n",
      "\u001b[?25hInstalling collected packages: whoosh\n",
      "Successfully installed whoosh-2.7.4\n"
     ]
    }
   ],
   "source": [
    "# Whoosh搜索引擎\n",
    "'''\n",
    "纯Python的全文搜索库\n",
    "Whoosh是索引文本及搜索文本的类和函数库。\n",
    "它能让你开发出一个个性化的经典搜索引擎。\n",
    "'''\n",
    "!pip install whoosh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "result of  水果世博园\n",
      "买<b class=\"match term0\">水果</b>然后来<b class=\"match term1\">世博园</b>\n",
      "==========\n",
      "result of  你\n",
      "second one <b class=\"match term0\">你</b> 中文测试中文 is even more interesting\n",
      "==========\n",
      "result of  first\n",
      "<b class=\"match term0\">first</b> document we’ve added\n",
      "==========\n",
      "result of  中文\n",
      "second one 你 <b class=\"match term0\">中文</b>测试<b class=\"match term0\">中文</b> is even more interesting\n",
      "==========\n",
      "result of  交换机\n",
      "干事每月经过下属科室都要亲口交代24口<b class=\"match term0\">交换机</b>等技术性器件的安装工作\n",
      "==========\n",
      "result of  交换\n",
      "咱俩<b class=\"match term0\">交换</b>一下吧\n",
      "干事每月经过下属科室都要亲口交代24口<b class=\"match term0\">交换</b>机等技术性器件的安装工作\n",
      "==========\n",
      "我\n",
      "好\n",
      "朋友\n",
      "是\n",
      "李明\n",
      "我\n",
      "爱\n",
      "北京\n",
      "天安\n",
      "天安门\n",
      "ibm\n",
      "microsoft\n",
      "dream\n",
      "intetest\n",
      "interest\n",
      "me\n",
      "lot\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from __future__ import unicode_literals\n",
    "import sys,os\n",
    "sys.path.append(\"../\")\n",
    "from whoosh.index import create_in,open_dir\n",
    "from whoosh.fields import *\n",
    "from whoosh.qparser import QueryParser\n",
    "\n",
    "from jieba.analyse.analyzer import ChineseAnalyzer\n",
    "\n",
    "analyzer = ChineseAnalyzer()\n",
    "\n",
    "schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))\n",
    "if not os.path.exists(\"tmp\"):\n",
    "    os.mkdir(\"tmp\")\n",
    "\n",
    "ix = create_in(\"tmp\", schema) # for create new index\n",
    "#ix = open_dir(\"tmp\") # for read only\n",
    "writer = ix.writer()\n",
    "\n",
    "writer.add_document(\n",
    "    title=\"document1\",\n",
    "    path=\"/a\",\n",
    "    content=\"This is the first document we’ve added!\"\n",
    ")\n",
    "\n",
    "writer.add_document(\n",
    "    title=\"document2\",\n",
    "    path=\"/b\",\n",
    "    content=\"The second one 你 中文测试中文 is even more interesting! 吃水果\"\n",
    ")\n",
    "\n",
    "writer.add_document(\n",
    "    title=\"document3\",\n",
    "    path=\"/c\",\n",
    "    content=\"买水果然后来世博园。\"\n",
    ")\n",
    "\n",
    "writer.add_document(\n",
    "    title=\"document4\",\n",
    "    path=\"/c\",\n",
    "    content=\"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作\"\n",
    ")\n",
    "\n",
    "writer.add_document(\n",
    "    title=\"document4\",\n",
    "    path=\"/c\",\n",
    "    content=\"咱俩交换一下吧。\"\n",
    ")\n",
    "\n",
    "writer.commit()\n",
    "searcher = ix.searcher()\n",
    "parser = QueryParser(\"content\", schema=ix.schema)\n",
    "\n",
    "for keyword in (\"水果世博园\",\"你\",\"first\",\"中文\",\"交换机\",\"交换\"):\n",
    "    print(\"result of \",keyword)\n",
    "    q = parser.parse(keyword)\n",
    "    results = searcher.search(q)\n",
    "    for hit in results:\n",
    "        print(hit.highlights(\"content\"))\n",
    "    print(\"=\"*10)\n",
    "\n",
    "for t in analyzer(\"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot\"):\n",
    "    print(t.text)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
